This document provides a simple analysis of 5.5 hours of the Taiwan Mandarin Spontaneous Speech Corpus.
library(readr)
library(tidyverse)
library(ggplot2)
library(stringr)
library(gridExtra)
The corpus currently contains 11 subjects drawn from two age groups. The younger group (4M, 4F) ranges in age from 20 to 35, while the older group (2M, 1F) ranges from 50 to 65. All of them were born and raised in Taipei. Each subject's TextGrid files were converted to CSV files to give a better view of the word and syllable frequencies in spontaneous speech. This part reads the 11 files and pre-processes the syllable data and word data separately for later analysis. To focus only on the Taiwan Mandarin data, foreign words, such as Southern Min and English, were excluded from the following analysis.
# Read every per-subject "*_f.csv" file under `mypath`, drop the exported
# row-index column X1, and combine the files into one data frame, tagging
# each row with the source file name (id), age group ("Y"/"O"), and gender.
#
# NOTE(review): `age` and `gender` are matched to the files purely by the
# (alphabetical, locale-dependent) order in which list.files() returns them.
# Adding, removing, or renaming a file silently shifts the labels; a
# metadata table keyed by filename would be safer.
multmerge <- function(mypath){
  # Anchor the pattern: the original "_f.csv" let "." match any character
  # and matched anywhere in the name.
  filenames <- list.files(mypath, pattern = "_f\\.csv$")
  age <- c("O", "Y", "Y", "Y", "O", "Y", "Y", "Y", "Y", "O","Y")
  gender <- c("M", "M", "F", "M", "M", "M", "F", "F", "F", "F", "M")
  # Fail loudly instead of silently recycling/misaligning the labels.
  stopifnot(length(filenames) == length(age))
  datalist <- Map(function(x, y, z){
    # file.path() makes the function work for paths other than "./"
    # (the original read_csv(x) only worked relative to the working dir).
    read_csv(file.path(mypath, x), col_types = cols(X1 = col_skip())) %>%
      mutate(id = x, age = y, gender = z)
  }, filenames, age, gender)
  # bind_rows() accepts a list of data frames directly; no pairwise Reduce().
  bind_rows(datalist)
}
mycorpus <- multmerge("./")
# Syllable/word character strings containing Latin letters mark foreign
# (non-Mandarin) material; collect them so they can be excluded below.
has_latin <- "[a-zA-Z]"
nonchi_syll <- mycorpus$syll_c[str_detect(mycorpus$syll_c, has_latin)]
nonchi_word <- mycorpus$word_c[str_detect(mycorpus$word_c, has_latin)]
# Syllable-level data: keep rows with a romanized syllable whose character
# form was not flagged as foreign.
syll <- mycorpus %>%
  select(syll_e, syll_c, id, age, gender) %>%
  filter(!is.na(syll_e), !syll_c %in% nonchi_syll)
# Word-level data, filtered the same way.
word <- mycorpus %>%
  select(word_e, word_c, id, age, gender) %>%
  filter(!is.na(word_e), !word_c %in% nonchi_word)
This part presents frequencies at the syllable level. There are 82178 syllable tokens in total.
After filtering out the non-Chinese syllables, this figure presents the top 20 most frequent syllables with tones, out of 1007 types, in Taiwan Mandarin spontaneous speech.
# Token frequency of each romanized syllable (with tone), most frequent
# first; count(sort = TRUE) replaces group_by/count/arrange/ungroup.
syll_e <- syll %>%
  count(syll_e, sort = TRUE)
# Keep the 20 most frequent syllables; fct_reorder() orders the bars by
# frequency (ties broken by the level order fixed in the factor() call).
syll_e1 <- syll_e %>%
  rename(freq = n, s = syll_e) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  slice_max(freq, n = 20) %>%  # slice_max() supersedes top_n(); both keep ties
  mutate(s = fct_reorder(s, freq))
f0 <- ggplot(data = syll_e1, mapping = aes(x = s, y = freq)) +
  geom_col(fill = "#96ffa4", alpha = .6, width = .4) +  # geom_col() == geom_bar(stat = "identity")
  scale_y_continuous(expand = c(0, 0), limits = c(0, 3800)) +  # spell out `limits`; avoid partial matching
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), alpha = .7, size = 3.5, hjust = 0)
f0
#ggsave("f0.png", dpi=600)
This figure presents the top 20 most frequent syllables without tones, out of 404 types, in Taiwan Mandarin spontaneous speech.
# Same frequency analysis, but with the tone digits stripped from each
# romanized syllable.
notone <- str_remove_all(syll$syll_e, "[0-9]")
df <- tibble::tibble(syll = notone)
# Count each toneless syllable type.
syll_notone <- df %>%
  filter(!is.na(syll)) %>%
  count(syll)
# Top 20 toneless syllables; bar order is fixed by frequency.
syll_notone1 <- syll_notone %>%
  rename(freq = n, s = syll) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))
f1 <- syll_notone1 %>%
  ggplot(aes(x = s, y = freq)) +
  geom_bar(stat = "identity", fill = "#ffe100", alpha = .6, width = .4) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 4800)) +
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0)
f1
#ggsave("f1.png", dpi=600)
This figure presents the tone frequencies at the syllable level in Taiwan Mandarin spontaneous speech.
# Strip the letters from each romanized syllable so only the tone digit(s)
# remain, then count tone frequencies.
syll_tone <- str_replace_all(syll$syll_e, "[:alpha:]", "")
# Entries still containing whitespace are multi-part annotations; drop them
# entirely. (The original then re-replaced blanks in the survivors, but
# that was dead code: no blank-containing entry survives this filter.)
syll_tone <- syll_tone[!str_detect(syll_tone, "[:blank:]")]
syll_tone <- str_replace_all(syll_tone, "[:punct:]", "")
df_tone <- tibble::tibble(syll_tone = syll_tone)
# Count the tones and keep the top 5 in descending order.
# NOTE: the row order of df_tone1 matters downstream (df_tone1$s[1..5] is
# indexed positionally in the f8 section), so arrange() + top_n() is kept
# rather than slice_max(), which could reorder tied rows.
df_tone1 <- df_tone %>%
  filter(!is.na(syll_tone), syll_tone != "") %>%
  count(syll_tone) %>%
  rename(freq = n, s = syll_tone) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(5, freq) %>%
  mutate(s = fct_reorder(s, freq))
f2 <- ggplot(data = df_tone1, mapping = aes(x = s, y = freq)) +
  geom_col(fill = "#f68060", alpha = .6, width = .4) +  # geom_col() == geom_bar(stat = "identity")
  scale_y_continuous(expand = c(0, 0), limits = c(0, 31000)) +
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0)
f2
#ggsave("f2.png", dpi=600)
This figure presents the top 20 most frequent syllables written as characters, out of 1911 types, in Taiwan Mandarin spontaneous speech.
# Frequency of each syllable written in Chinese characters.
syll_c <- syll %>%
  select(syll_c) %>%
  rename(s = syll_c) %>%
  count(s)
# Top 20 character syllables; bar order is fixed by frequency.
syll_c1 <- syll_c %>%
  rename(freq = n) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))
f3 <- syll_c1 %>%
  ggplot(aes(x = s, y = freq)) +
  geom_bar(stat = "identity", fill = "#73d3ff", alpha = .6, width = .4) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 3500)) +
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0)
f3
#ggsave("f3.png", dpi=600)
This part presents frequencies at the word level. There are 56950 word tokens in total.
This figure presents the top 20 most frequent words, out of 4934 types, in Taiwan Mandarin spontaneous speech.
# Word frequencies (character form). `word_c` is reused by later sections.
word_c <- word %>%
  count(word_c)
# Top 20 words; bar order is fixed by frequency.
word_c1 <- word_c %>%
  rename(freq = n, s = word_c) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))
f4 <- word_c1 %>%
  ggplot(aes(x = s, y = freq)) +
  geom_bar(stat = "identity", fill = "#c2adf0", alpha = .6, width = .4) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 3000)) +
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0)
f4
#ggsave("f4.png", dpi=600)
This figure presents the top 20 most frequent romanized words, out of 4899 types, in Taiwan Mandarin spontaneous speech.
# Romanized word frequencies (with tone digits).
word_e <- word %>%
  count(word_e)
# Top 20 romanized words; bar order is fixed by frequency.
word_e1 <- word_e %>%
  rename(freq = n, s = word_e) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))
f5 <- word_e1 %>%
  ggplot(aes(x = s, y = freq)) +
  geom_bar(stat = "identity", fill = "#f7bce6", alpha = .6, width = .4) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 3000)) +
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0)
f5
#ggsave("f5.png", dpi=600)
This figure presents the top 20 most frequent romanized words without tones, out of 4272 types, in Taiwan Mandarin spontaneous speech.
# Romanized words with the tone digits removed.
word_notone <- str_remove_all(word$word_e, "[0-9]")
df <- tibble::tibble(word_e = word_notone)
# Count each toneless romanized word type.
df1 <- df %>%
  count(word_e)
# Top 20; bar order is fixed by frequency.
df2 <- df1 %>%
  rename(freq = n, s = word_e) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(20, freq) %>%
  mutate(s = fct_reorder(s, freq))
f6 <- df2 %>%
  ggplot(aes(x = s, y = freq)) +
  geom_bar(stat = "identity", fill = "#5de8d8", alpha = .6, width = .4) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 3000)) +
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0)
f6
#ggsave("f6.png", dpi=600)
This figure presents the top 15 most frequent tone combinations, out of 222 types, at the word level in Taiwan Mandarin spontaneous speech.
# Reduce each romanized word to its tone-digit sequence (letters, blanks and
# punctuation removed). NOTE: `tone` and `df` are reused by the following
# sections, so their names must stay the same.
tone <- word$word_e %>%
  str_replace_all("[:alpha:]", "") %>%
  str_replace_all("[:blank:]", "") %>%
  str_replace_all("[:punct:]", "")
df <- tibble::tibble(tone = tone)
# Count each non-empty tone pattern.
tone_c <- df %>%
  filter(tone != "") %>%
  count(tone)
# Top 15 tone combinations; bar order is fixed by frequency.
tone_c1 <- tone_c %>%
  rename(freq = n, s = tone) %>%
  mutate(s = factor(s, levels = s)) %>%
  arrange(desc(freq)) %>%
  top_n(15, freq) %>%
  mutate(s = fct_reorder(s, freq))
f7 <- tone_c1 %>%
  ggplot(aes(x = s, y = freq)) +
  geom_bar(stat = "identity", fill = "#577bde", alpha = .6, width = .4) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 13000)) +
  coord_flip() +
  xlab("") +
  theme_bw() +
  geom_text(aes(label = freq), size = 3.5, alpha = .7, hjust = 0)
f7
#ggsave("f7.png", dpi=600)
# Subsets of tone patterns by the tone digit they contain. A pattern such
# as "34" contains both "3" and "4", so it appears in more than one of
# t0..t4 -- i.e. the stacked counts below overlap across tone groups
# (presumably intentional: "patterns containing tone X"; worth confirming).
t4 <- df$tone[str_detect(df$tone,"4")]
t3 <- df$tone[str_detect(df$tone,"3")]
t2 <- df$tone[str_detect(df$tone,"2")]
t1 <- df$tone[str_detect(df$tone,"1")]
t0 <- df$tone[str_detect(df$tone,"0")]
# Label each subset with a tone level taken from df_tone1 by POSITION.
# NOTE(review): this hard-codes the assumption that df_tone1 rows 1..5 hold
# the tones "4", "3", "1", "2", "0" in that frequency order. If the corpus
# changes, the labels silently shift -- verify against figure f2.
tone_w_4 <- df %>%
filter(!tone == "") %>%
filter(tone %in% t4) %>%
mutate(t = df_tone1$s[1])
tone_w_3 <- df %>%
filter(!tone == "") %>%
filter(tone %in% t3) %>%
mutate(t = df_tone1$s[2])
tone_w_1 <- df %>%
filter(!tone == "") %>%
filter(tone %in% t1) %>%
mutate(t = df_tone1$s[3])
tone_w_2 <- df %>%
filter(!tone == "") %>%
filter(tone %in% t2) %>%
mutate(t = df_tone1$s[4])
tone_w_0 <- df %>%
filter(!tone == "") %>%
filter(tone %in% t0) %>%
mutate(t = df_tone1$s[5])
# Stack all subsets into one frame; `tone_p` is reused by a later section.
list_tone <- list(tone_w_4,tone_w_3,tone_w_2,tone_w_1,tone_w_0)
tone_p <- Reduce(function(x,y){bind_rows(x,y)}, list_tone)
# Keep only one- and two-digit tone patterns, then count each (tone level,
# pattern) combination for the stacked bar chart.
tone_p1 <- tone_p %>%
mutate(len = str_length(tone)) %>%
filter(len <= 2) %>%
group_by(t, tone) %>%
count() %>%
rename(freq = n)
# Stacked bars: x = tone level, fill = tone pattern containing it.
f8 <- ggplot(data = tone_p1, mapping = aes(x = t, y = freq, fill = tone),
position="stack") +
geom_bar(position="stack", stat="identity", alpha=.6) +
scale_y_continuous(expand = c(0,0), limit = c(0,24000)) +
geom_text(aes(label = tone), size = 4, alpha=.7, position = position_stack(vjust = 0.5))
f8
#ggsave("f8.png", dpi=600, width = 7, height = 5)
POS labels were created with the CKIP package on Google Colab.
# Per-token POS tags produced by the CKIP tagger.
pos <- read_csv("POS.csv")
# NOTE(review): this assumes POS.csv has exactly one row per row of `word`,
# in the same order, and that `tone` (built earlier from word$word_e) is
# likewise row-aligned -- TODO confirm against the Colab export.
word1 <- word %>%
mutate(POS = pos$POS, tone = tone)
# Fine-grained CKIP tags grouped into coarse word classes.
C <- c("Caa", "Cbb")
ADV <- c("Da", "Dfa", "Dfb", "D", "Dk")
POST <- c("Cab", "Cba", "Neqb", "Ng")
ASP <- c("Di")
N <- c("Na", "Nb", "Nc", "Nd", "Ncd", "Nh")
DET <- c("Neu", "Nes", "Nep", "Neqa")
M <- c("Nf")
T1 <- c("I", "T", "DE")  # named T1 to avoid masking base R's T
Vi <- c("VA", "VB", "VH", "VI")
Vt <- c("VAC", "VC", "VCL", "VD", "VE", "VF", "VG", "VHC", "VJ", "VK", "VL", "SHI", "V_2")
# Vectorized recoding; replaces the original row-by-row for-loop over
# word1$POS. First matching condition wins, mirroring the if/else chain;
# unmatched tags are kept unchanged.
# NOTE(review): the original mapped COMMACATEGORY -> "ASP" and
# PERIODCATEGORY -> "DET"; that looks accidental but is preserved here --
# confirm whether punctuation categories should really be folded into the
# aspect/determiner counts.
word1$POS <- case_when(
  word1$POS %in% C ~ "C",
  word1$POS %in% ADV ~ "ADV",
  word1$POS %in% POST ~ "POST",
  word1$POS %in% ASP ~ "ASP",
  word1$POS %in% N ~ "N",
  word1$POS %in% DET ~ "DET",
  word1$POS %in% M ~ "M",
  word1$POS %in% T1 ~ "T",
  word1$POS %in% Vi ~ "Vi",
  word1$POS %in% Vt ~ "Vt",
  word1$POS == "COMMACATEGORY" ~ "ASP",
  word1$POS == "PERIODCATEGORY" ~ "DET",
  TRUE ~ word1$POS
)
##tokens
# Percentage of word tokens per coarse POS class over the whole corpus.
# The original built this with a multi-row summarise() followed by
# distinct(); count()/mutate() gives the same one-row-per-POS result
# without relying on deprecated multi-row summarise behavior.
total_tokens <- nrow(word1)
word_c_POS1 <- word1 %>%
  count(POS) %>%
  mutate(p = 100 * n / total_tokens) %>%
  select(-n) %>%
  mutate(type = "Tokens", speaker = "All speakers")
# Tokens produced by each speaker (denominator for per-speaker shares).
id_len <- word1 %>%
  select(id) %>%
  group_by(id) %>%
  count() %>%
  rename(all = n)
# Per-speaker POS token counts.
word_c_POS3 <- word1 %>%
  select(word_c, id, POS) %>%
  mutate(type = "Tokens", speaker = "Average Speaker") %>%
  group_by(POS, id, type, speaker) %>%
  count()
POS_ave <- right_join(word_c_POS3, id_len, by = "id")
# Percentage per speaker, then averaged over speakers ("average speaker").
POS_ave1 <- POS_ave %>%
  group_by(POS, id, type, speaker) %>%
  summarise(p = 100 * n / all) %>%
  group_by(type, POS, speaker) %>%
  summarise(p = mean(p))
##types
# One row per distinct (POS, word) type, with its token count.
pos_type <- word1 %>%
  group_by(POS, word_c) %>%
  count() %>%
  ungroup()
# Percentage of word TYPES per POS class over the whole corpus. As with
# the token version above, count()/mutate() replaces the original
# multi-row summarise() + distinct(). select(POS) first is essential:
# pos_type already has an `n` column, which count() would otherwise use
# as weights. The per-POS count `n` is kept because the original result
# also carried it.
total_word_types <- nrow(pos_type)
word_c_POS2 <- pos_type %>%
  select(POS) %>%
  count(POS) %>%
  mutate(p = 100 * n / total_word_types, type = "Types", speaker = "All speakers")
# Distinct word types per speaker, with their token counts.
word_c_t <- word1 %>%
  count(word_c, id, POS)
# Number of word types per speaker (denominator). The select(id) keeps
# count() from treating the existing `n` column as weights.
id_len_t <- word_c_t %>%
  select(id) %>%
  count(id) %>%
  rename(all = n)
# Per-speaker type counts per POS class.
word_pos_ave <- word_c_t %>%
  select(POS, word_c, id) %>%
  mutate(type = "Types", speaker = "Average Speaker") %>%
  group_by(POS, id, type, speaker) %>%
  count()
pos_ave <- right_join(word_pos_ave, id_len_t, by = "id")
# Percentage of each speaker's types per POS, averaged across speakers.
pos_ave1 <- pos_ave %>%
  group_by(POS, id, type, speaker) %>%
  summarise(p = 100 * n / all) %>%
  group_by(type, POS, speaker) %>%
  summarise(p = mean(p))
# Combine corpus-wide and per-speaker shares for both tokens and types.
pos_token <- bind_rows(word_c_POS1, POS_ave1)
pos_type <- bind_rows(word_c_POS2, pos_ave1)
pos_data <- bind_rows(pos_token, pos_type)
# Order the fill levels by share so the stacks read largest-to-smallest.
pos_data1 <- pos_data %>%
  ungroup() %>%
  mutate(POS = as.factor(POS)) %>%
  arrange(POS) %>%
  mutate(POS = fct_reorder(POS, p))
# Stacked percentage bars: tokens vs. types, faceted by speaker grouping.
f12 <- pos_data1 %>%
  ggplot(aes(x = type, y = p, fill = POS)) +
  geom_bar(position = "stack", stat = "identity", width = .5, alpha = .7) +
  scale_y_continuous(expand = c(0, 0), limit = c(0, 101)) +
  geom_text(aes(label = POS), size = 3, alpha = .7, check_overlap = TRUE,
            position = position_stack(vjust = 0.5)) +
  labs(x = "", y = "Percentage of words in corpus") +
  facet_grid(~speaker)
f12
# Cross-tabulate tone pattern x POS class for the tone patterns collected
# in tone_p, keeping combinations with more than 10 tokens.
word_tone_POS <- word1 %>%
  select(tone, POS) %>%
  filter(tone %in% tone_p$tone) %>%
  count(tone, POS) %>%
  filter(n > 10) %>%
  arrange(desc(n))
knitr::kable(word_tone_POS[1:50,])
| tone | POS | n |
|---|---|---|
| 0 | T | 4340 |
| 4 | ADV | 3696 |
| 3 | N | 3393 |
| 4 | Vt | 2867 |
| 4 | DET | 2247 |
| 3 | Vt | 1624 |
| 0 | M | 1348 |
| 1 | N | 1289 |
| 3 | ADV | 1285 |
| 4 | P | 1224 |
| 44 | N | 1068 |
| 1 | Vt | 934 |
| 24 | N | 896 |
| 24 | ADV | 815 |
| 2 | ADV | 814 |
| 44 | ADV | 744 |
| 1 | ADV | 717 |
| 4 | Vi | 637 |
| 42 | N | 550 |
| 30 | N | 533 |
| 34 | ADV | 501 |
| 2 | N | 492 |
| 14 | C | 490 |
| 3 | Vi | 475 |
| 1 | DET | 474 |
| 14 | N | 472 |
| 11 | N | 445 |
| 20 | Vt | 445 |
| 43 | N | 437 |
| 44 | Vt | 416 |
| 3 | DET | 412 |
| 4 | N | 399 |
| 32 | N | 397 |
| 33 | C | 397 |
| 41 | N | 395 |
| 44 | Vi | 391 |
| 23 | N | 384 |
| 14 | Vt | 365 |
| 2 | DET | 364 |
| 12 | N | 363 |
| 4 | M | 362 |
| 1 | P | 351 |
| 34 | N | 336 |
| 10 | N | 332 |
| 23 | ADV | 330 |
| 21 | N | 327 |
| 0 | ASP | 297 |
| 2 | Vt | 297 |
| 44 | C | 291 |
| 12 | ADV | 285 |
# Most frequent (tone pattern, POS class, word) combinations.
pos_word <- word1 %>%
  count(tone, POS, word_c) %>%
  arrange(desc(n))
knitr::kable(pos_word[1:50,])
| tone | POS | word_c | n |
|---|---|---|---|
| 3 | N | 我 | 2707 |
| 0 | T | 的 | 2562 |
| 4 | Vt | 是 | 1647 |
| 4 | DET | 那 | 1508 |
| 0 | M | 個 | 1345 |
| 4 | ADV | 就 | 1214 |
| 3 | Vt | 有 | 833 |
| 1 | N | 他 | 745 |
| 4 | P | 在 | 596 |
| 3 | ADV | 很 | 594 |
| 4 | DET | 這 | 565 |
| 0 | T | 啊 | 552 |
| 24 | ADV | 然後 | 539 |
| 1 | ADV | 都 | 517 |
| 30 | N | 我們 | 497 |
| 3 | ADV | 也 | 493 |
| 14 | C | 因為 | 485 |
| 3 | N | 你 | 476 |
| 4 | ADV | 會 | 450 |
| 20 | Vt | 覺得 | 445 |
| 4 | ADV | 不 | 444 |
| 24 | N | 時候 | 422 |
| 44 | ADV | 就是 | 421 |
| 4 | ADV | 要 | 407 |
| 1 | DET | 一 | 404 |
| 33 | C | 所以 | 396 |
| 0 | T | 呢 | 384 |
| 1 | Vt | 說 | 369 |
| 4 | ADV | 去 | 326 |
| 1 | P | 跟 | 320 |
| 2 | N | 人 | 305 |
| 2 | DET | 一 | 291 |
| 10 | N | 他們 | 266 |
| 0 | ASP | 了 | 262 |
| 20 | DET | 什麼 | 262 |
| 3 | Vi | 好 | 256 |
| 4 | Vi | 對 | 255 |
| 4 | ADV | 那 | 254 |
| 34 | ADV | 比較 | 252 |
| 2 | ADV | 還 | 249 |
| 4 | P | 對 | 248 |
| 44 | N | 現在 | 234 |
| 3 | Vt | 講 | 230 |
| 1 | Vt | 吃 | 228 |
| 0 | T | 啦 | 221 |
| 4 | Vt | 看 | 216 |
| 3 | M | 種 | 204 |
| 23 | ADV | 沒有 | 190 |
| 0 | T | 喔 | 178 |
| 44 | C | 但是 | 174 |
All speakers vs. the average speaker.
# Word-length distribution (in characters): all speakers vs. the average
# speaker, for both tokens and types. `len` is a plain vector aligned with
# word1 purely by row order; the mutate(len = len) calls below rely on
# that alignment.
len <- str_length(word1$word_c)
# Tokens per speaker (denominator for per-speaker proportions).
id_len <- word1 %>%
select(id) %>%
group_by(id) %>%
count() %>%
rename(all = n)
# Corpus-wide denominators: all word tokens and all word types.
all_tokens <- length(word$word_c)
all_types <- length(word_c$word_c)
## word tokens
# Token counts per word length, per speaker.
word_len_ave <- word1 %>%
select(word_c, id) %>%
mutate(len = len, type = "Tokens", speaker = "Average Speaker") %>%
group_by(len, id, type, speaker) %>%
count()
ave <- right_join(word_len_ave, id_len, by = "id")
# Per-speaker proportion of each length, then averaged across speakers.
ave1 <- ave %>%
group_by(len, id, type, speaker) %>%
summarise(p = n/all) %>%
group_by(type, len, speaker) %>%
summarise(m = mean(p))
# Corpus-wide proportion of each length (tokens).
word_len_all <- word1 %>%
select(word_c) %>%
mutate(len = len, type = "Tokens", speaker = "All Speakers") %>%
group_by(type, len, speaker) %>%
summarise(n = n(), m = n/all_tokens) %>%
select(-n)
token_all_ave <- bind_rows(word_len_all, ave1)
##word types
# Distinct word types per speaker. NOTE(review): this reassigns word_c_t,
# which the POS section also defined with a different grouping.
word_c_t <- word %>%
group_by(word_c, id) %>%
count() %>%
ungroup()
len_t <- str_length(word_c_t$word_c)
# Number of types per speaker (denominator).
id_len_t <- word_c_t %>%
select(id) %>%
group_by(id) %>%
count() %>%
rename(all = n)
# Type counts per word length, per speaker.
word_type_ave <- word_c_t %>%
select(word_c, id) %>%
mutate(len = len_t, type = "Types", speaker = "Average Speaker") %>%
group_by(len, id, type, speaker) %>%
count()
type_ave <- right_join(word_type_ave, id_len_t, by = "id")
# Per-speaker proportion of each length, averaged across speakers.
type_ave1 <- type_ave %>%
group_by(len, id, type, speaker) %>%
summarise(p = n/all) %>%
group_by(type, len, speaker) %>%
summarise(m = mean(p))
# Corpus-wide proportion of each length (types).
len_t_all <- str_length(word_c$word_c)
word_type_all <- word_c %>%
select(word_c) %>%
mutate(len = len_t_all, type = "Types", speaker = "All Speakers") %>%
group_by(type, len, speaker) %>%
summarise(n = n(), m = n/all_types) %>%
select(-n)
type_all_ave <- bind_rows(type_ave1, word_type_all)
len_data <- bind_rows(type_all_ave, token_all_ave)
# Convert proportions to percentages; order the fill levels by share.
len_data1 <- len_data %>%
ungroup() %>%
mutate(len = as.factor(len), m = 100*m) %>%
arrange(len) %>%
mutate(len = fct_reorder(len, m))
# Stacked percentage bars, faceted by speaker grouping.
# NOTE(review): scale_fill_manual() only defines colors for lengths 1-6;
# a word of 7+ characters would get an NA fill -- confirm 6 is the maximum.
f9 <- ggplot(data = len_data1, mapping = aes(x = type, y = m, fill = len),
position = "stack") +
geom_bar(position="stack", stat="identity", width = .5, alpha = .7) +
scale_y_continuous(expand = c(0,0), limit = c(0, 101)) +
scale_fill_manual(
values = c("royalblue", "skyblue", "blue", "darkblue","navy","black"),
limits = c("1", "2", "3", "4", "5", "6"),
breaks =c("1", "2", "3", "4", "5", "6"),
name = "syll_len", labels = c("1", "2", "3", "4", "5", "6")) +
geom_text(aes(label = round(m)), size = 4, alpha=.7, check_overlap = TRUE,
position = position_stack(vjust = 0.5)) +
labs(x = "", y = "Percentage of words in corpus") +
facet_grid(~speaker)
#ggsave("f9.png", dpi = 600)
f9
##word tokens
# Word-length distribution split by age group and gender.
# `len` is aligned with word1 purely by row order.
len <- str_length(word1$word_c)
# Token counts per (age, gender, length) cell.
word_len_age <- word1 %>%
select(word_c, age, gender) %>%
mutate(len = len) %>%
group_by(age, gender, len) %>%
count()
# Total tokens per (age, gender) cell (denominator).
age_all <- word_len_age %>%
group_by(age, gender) %>%
summarise(all = sum(n))
word_len_age1 <- right_join(word_len_age, age_all, by = c("age", "gender"))
# Proportion of each length within its (age, gender) cell.
word_len_age2 <- word_len_age1 %>%
mutate(type = "Tokens") %>%
group_by(age, gender, len, type) %>%
summarise(m = n/all)
##word types
# Distinct word types per (age, gender) cell.
word_c_t_a <- word %>%
group_by(word_c, age, gender) %>%
count() %>%
ungroup()
len_t_a <- str_length(word_c_t_a$word_c)
# Number of types per (age, gender) cell (denominator).
age_len_t <- word_c_t_a %>%
select(age, gender) %>%
group_by(age, gender) %>%
count() %>%
rename(all = n)
# Type counts per word length within each (age, gender) cell.
word_t_ave <- word_c_t_a %>%
select(word_c, age, gender) %>%
mutate(len = len_t_a, type = "Types") %>%
group_by(len, age, gender, type) %>%
count()
t_ave <- right_join(word_t_ave, age_len_t, by = c("age", "gender"))
# Proportion of each length; both group_by() calls use the same keys, so
# mean(p) averages over a single value and simply carries it through.
t_ave1 <- t_ave %>%
group_by(len, age, gender, type) %>%
summarise(p = n/all) %>%
group_by(age, gender, type, len) %>%
summarise(m = mean(p))
len_age <- bind_rows(word_len_age2, t_ave1)
# Percentages, with fill levels ordered by share.
len_age1 <- len_age %>%
ungroup() %>%
mutate(len = as.factor(len), m = 100*m) %>%
arrange(len) %>%
mutate(len = fct_reorder(len, m))
# Stacked bars faceted by gender (rows) and age group (columns).
f10 <- ggplot(data = len_age1, mapping = aes(x = type, y = m, fill = len),
position = "stack") +
geom_bar(position="stack", stat="identity", width = .5, alpha = .7) +
scale_y_continuous(expand = c(0,0), limit = c(0, 101)) +
geom_text(aes(label = round(m)), size = 3, alpha=.7, check_overlap = TRUE,
position = position_stack(vjust = 0.5)) +
labs(x = "", y = "") +
facet_grid(gender~ age)
#ggsave("f10.png", dpi= 600)
f10
# Word-length distribution split by gender only.
# `len` is aligned with word1 purely by row order.
len <- str_length(word1$word_c)
# Token counts per (gender, length) cell.
word_len_gender <- word1 %>%
select(word_c, gender) %>%
mutate(len = len) %>%
group_by(gender, len) %>%
count()
# Total tokens per gender (denominator).
gender_all <- word_len_gender %>%
group_by(gender) %>%
summarise(all = sum(n))
word_len_gender1 <- right_join(word_len_gender, gender_all, by = "gender")
# Proportion of each length within each gender.
word_len_gender2 <- word_len_gender1 %>%
mutate(type = "Tokens") %>%
group_by(gender, len, type) %>%
summarise(m = n/all)
##word types
# Distinct word types per gender.
word_c_t_g <- word %>%
group_by(word_c, gender) %>%
count() %>%
ungroup()
len_t_g <- str_length(word_c_t_g$word_c)
# Number of types per gender (denominator).
gender_len_t <- word_c_t_g %>%
select(gender) %>%
group_by(gender) %>%
count() %>%
rename(all = n)
# Type counts per word length within each gender.
word_g_ave <- word_c_t_g %>%
select(word_c, gender) %>%
mutate(len = len_t_g, type = "Types") %>%
group_by(len, gender, type) %>%
count()
t_g <- right_join(word_g_ave, gender_len_t, by = "gender")
# Proportion of each length; both group_by() calls use the same keys, so
# mean(p) averages over a single value and simply carries it through.
t_g1 <- t_g %>%
group_by(len, gender, type) %>%
summarise(p = n/all) %>%
group_by(gender, type, len) %>%
summarise(m = mean(p))
len_gender <- bind_rows(word_len_gender2, t_g1)
# Percentages, with fill levels ordered by share.
len_gender1 <- len_gender %>%
ungroup() %>%
mutate(len = as.factor(len), m = 100*m) %>%
arrange(len) %>%
mutate(len = fct_reorder(len, m))
# Stacked bars faceted by gender.
f11 <- ggplot(data = len_gender1, mapping = aes(x = type, y = m, fill = len),
position = "stack") +
geom_bar(position="stack", stat="identity", width = .5, alpha = .7) +
scale_y_continuous(expand = c(0,0), limit = c(0, 101)) +
geom_text(aes(label = round(m)), size = 4, alpha=.7, check_overlap = TRUE,
position = position_stack(vjust = 0.5)) +
labs(x = "", y = "") +
facet_grid(~ gender)
#ggsave("f11.png", dpi = 600)
f11